library(data.table)
library(dplyr)
library(zoo)
library(lubridate)
library(ggplot2)
library(readr)
library(plotly)
library(RColorBrewer)
library(grDevices)
library(knitr)
####READING DATA
#Data files used were made by using Python code (file Project_notebook.ipynb) and/or was sent by the client
#Reservations: 2 datafiles
res_view_old <- fread("Reservations_view.csv")[,c(2,3,4,6,7,8,9,14,16,28,31,34,35,36,41)]
res_view <- fread("Res_new.csv")
res_view <- res_view[,c(2,3,4,6,7,8,9,18,20,32,35,38,39,40,45)]
reservations <- merge(res_view_old,res_view,all=TRUE)
reservations <- reservations[!duplicated(reservations$rate_quote_uuid),]
write.csv(reservations,"reservations.csv")
#Rate quotes filtered & dup
#dup - no duplicates by request uuid
#filtered - no duplicates and removed very similar requests (gone through phase 2)
rateq1_filtered <- fread("filtered_rate_quote_1.csv")
rateq2_filtered <- fread("filtered_rate_quote_2.csv")
rateq3_filtered <- fread("filtered_rate_quote_3.csv")
rateq4_filtered <- fread("filtered_rate_quote_4.csv")
rateq_filtered <- merge(merge(merge(rateq1_filtered,rateq2_filtered,all=TRUE),rateq3_filtered,all=TRUE),rateq4_filtered,all=TRUE)
remove(rateq1_filtered,rateq2_filtered,rateq3_filtered,rateq4_filtered)
rateq1_dup <- fread("rate_quote_1_dup1.csv")
rateq2_dup <- fread("rate_quote_2_dup1.csv")
rateq3_dup <- fread("rate_quote_3_dup1.csv")
rateq4_dup <- fread("rate_quote_4_dup1.csv")
rateq_dup <- merge(merge(merge(rateq1_dup,rateq2_dup,all=TRUE),rateq3_dup,all=TRUE),rateq4_dup,all=TRUE)
remove(rateq1_dup,rateq2_dup,rateq3_dup,rateq4_dup)
write.csv(rateq_dup,"rateq_dup.csv")
write.csv(rateq_filtered,"rateq_filtered.csv")
hotdays_rq_nodupl_brokers_byday <- fread("hotdays_rq_nodupl.csv")
hotdays_rq_nodupl_brokers_byweek <- fread("hotdays_rq_nodupl_week.csv")
hotdays_reserv_brokers_byday <- fread("hotdays_reserv_broker.csv")
hotdays_reserv_statustype_byday <- fread("hotdays_reserv_statustype.csv")
hotdays_reserv_acriss_byday <-fread("hotdays_reserv_acriss.csv")
Overview of how many days we have in the rate quote (duplicated) data.
tabel_days <- rateq_dup %>% group_by(as.Date(timestamp)) %>% summarise(min=min(timestamp),max=max(timestamp))
kable(tabel_days,caption = "Rate quotes")
| as.Date(timestamp) | min | max |
|---|---|---|
| 2017-12-01 | 2017-12-01 00:00:39.911 | 2017-12-01 23:59:42.076 |
| 2017-12-02 | 2017-12-02 00:00:21.101 | 2017-12-02 23:59:53.040 |
| 2017-12-03 | 2017-12-03 00:00:04.960 | 2017-12-03 20:57:14.210 |
| 2017-12-05 | 2017-12-05 00:00:00.060 | 2017-12-05 23:59:48.232 |
| 2017-12-06 | 2017-12-06 00:00:01.880 | 2017-12-06 23:59:36.254 |
| 2017-12-07 | 2017-12-07 00:00:06.907 | 2017-12-07 09:24:34.903 |
| 2017-12-11 | 2017-12-11 04:00:28.030 | 2017-12-11 21:27:35.617 |
| 2017-12-12 | 2017-12-12 03:36:12.871 | 2017-12-12 20:19:27.911 |
| 2017-12-13 | 2017-12-13 00:00:00.039 | 2017-12-13 23:59:53.457 |
| 2017-12-14 | 2017-12-14 00:00:08.780 | 2017-12-14 23:59:56.727 |
| 2017-12-15 | 2017-12-15 00:00:01.885 | 2017-12-15 21:19:09.761 |
Removing date 2017-12-07, due to the fact that it contains only data with the time period of 9 hours, which is clearly a lot different from other days.
Overview of the reservations data.
tabel_days2 <- reservations %>% group_by(date=as.Date(rate_request_timestamp)) %>% summarise(count=n())
p_tabel_days2 <- tabel_days2 %>% plot_ly(x=~date,y=~count) %>% add_lines() %>% layout(title="Reservations made")
p_tabel_days2
We can see that there are not many reservations done before 27th of February. For further illustrations, let’s cut this part of the data out.
#Removing date 2017-12-07
rateq_dup <- rateq_dup[!(as.Date(timestamp)=="2017-12-07"),]
rateq_filtered <- rateq_filtered[!(as.Date(timestamp)=="2017-12-07"),]
reservations <- reservations[as.Date(reservations$rate_request_timestamp) >="2017-02-27",]
one <- rateq_dup[,2]
one$datatype <- "Duplicated by uuid"
two <- rateq_filtered[,3]
two$datatype <- "Filtered"
together<- merge(one,two,all=TRUE)
#REQUESTS-----------------
#in case a log scale is needed
#together <-together %>% group_by(datatype,as.factor(as.Date(timestamp))) %>% summarise(count=n())
#ggplot(together, aes(x=`as.factor(as.Date(timestamp))`,y=count,group=datatype,colour=datatype))+geom_line()
p_distr_notchanging_plot <-ggplot(together,aes(x=as.factor(as.Date(timestamp)),group=datatype,colour=datatype))+labs(xlab="",title="Requests per day")+theme_bw()+geom_line(size=1.25,stat="count")+theme(legend.text = element_text(size=12),legend.title = element_blank(),axis.text.x = element_text(angle = 90, hjust = 1),legend.position = "bottom",plot.title=element_text(hjust=0.5),axis.title.x=element_blank(),panel.grid.minor = element_blank())
p_distr_notchanging_plot
One can see that the distribution is pretty similar between phase 1 and phase 2 filtration. Phase 1 - removing duplicated rows by uuid (red line) and Phase 2 - similar requests removed after removing duplicated rows by uuid.
modif_rate <- rateq_dup
modif_rate$timestamp <- as.Date(modif_rate$timestamp)
modif_rate$pickup_timestamp <- as.Date(modif_rate$pickup_timestamp)
modif_rate$return_timestamp <- as.Date(modif_rate$return_timestamp)
modif_rate$days_in_advance <- modif_rate$pickup_timestamp - modif_rate$timestamp
p_days_in_advance_dist <-ggplot(modif_rate,aes(x=days_in_advance))+geom_histogram(binwidth = 10,colour="black",fill=rgb(0.1,0.67,0.674))+theme_bw()+
labs(x="Days in advance",title="Price checking")+scale_x_continuous(breaks=c(seq(0,1000,50)))+
theme(panel.grid.minor=element_blank(),legend.position = "none",plot.title=element_text(hjust=0.5))
p_days_in_advance_dist
This plot shows how many days in advance customers are checking the prices. We can see that the most popular is first two weeks or three weeks. We can also see a slight bump near 200 days, which is logical because the rate quotes here have been made in december, so ~200 days ahead means checking prices for the summer that is the most popular vacation season. However since we are dealing with data from Iceland, one can see that winter times are very popular as well!
#In case there is more data and one would like to see the same plot within seasons
yq <- as.yearqtr(as.yearmon(modif_rate$timestamp, "%m/%d/%Y") + 1/12)
modif_rate$Season <- factor(format(yq, "%q"), levels = 1:4, labels = c("winter", "spring", "summer", "fall"))
p_by_season_facets <-ggplot(modif_rate,aes(x=days_in_advance))+geom_histogram(binwidth = 10,colour="black",fill=rgb(0.1,0.67,0.674))+facet_wrap(~Season,nrow=2)+theme_bw()+labs(x="Days in advance",title="Price checking by season")+scale_x_continuous(breaks=c(seq(0,1000,50)))+theme(panel.grid.minor=element_blank(),legend.position = "none",plot.title=element_text(hjust=0.5))
p_by_season_facets
modif_res <- reservations
modif_res$rate_request_timestamp <- as.Date(modif_res$rate_request_timestamp)
modif_res$pickup_timestamp <- as.Date(modif_res$pickup_timestamp)
modif_res$return_timestamp <- as.Date(modif_res$return_timestamp)
modif_res$days_in_advance <- modif_res$pickup_timestamp - modif_res$rate_request_timestamp
p_days_in_advance_dist_reserv <-ggplot(modif_res,aes(x=days_in_advance))+geom_histogram(binwidth = 10,colour="black",fill=rgb(0.1,0.67,0.674))+theme_bw()+
labs(x="Days in advance",title="Reservations made")+scale_x_continuous(breaks=c(seq(0,1000,50)))+
theme(panel.grid.minor=element_blank(),legend.position = "none",plot.title=element_text(hjust=0.5))
p_days_in_advance_dist_reserv
The plot for reservations made however is slightly different than the price checking plot. Let’s look at the days in advance distribution by seasons when the reservation requests were made.
#getting season in case needed
yq <- as.yearqtr(as.yearmon(modif_res$rate_request_timestamp, "%m/%d/%Y") + 1/12)
modif_res$Season <- factor(format(yq, "%q"), levels = 1:4, labels = c("WINTER", "SPRING", "SUMMER", "FALL"))
p_by_season_facets_reserv <-ggplot(modif_res,aes(x=days_in_advance))+geom_histogram(binwidth = 10,colour="black",fill=rgb(0.1,0.67,0.674))+facet_wrap(~Season,nrow=2)+theme_bw()+labs(x="Days in advance",title="Reservations requests made by season")+scale_x_continuous(breaks=c(seq(0,1000,50)))+theme(panel.grid.minor=element_blank(),legend.position = "none",plot.title=element_text(hjust=0.5),panel.grid.major.x=element_blank())
p_by_season_facets_reserv
One can notice that Winter and Fall are distributed very similarly. One can notice a bump in spring for 100-150 days advance, which means that these reservations are made for the summer season.
bybroker_data_req <-rateq_filtered %>% filter(Broker_name != "RateChain test")%>%group_by(Broker_name,as.factor(as.Date(timestamp))) %>% summarise(count=n())
bybroker_data_req$`as.factor(as.Date(timestamp))` <- format(as.Date(bybroker_data_req$`as.factor(as.Date(timestamp))`,"%Y-%m-%d"),format="%d. %b")
picked_colours <- brewer.pal(length(unique(bybroker_data_req$Broker_name)),"Spectral")
p_bybroker <- bybroker_data_req %>% plot_ly(x=~`as.factor(as.Date(timestamp))`,y=~count,type="bar",color=~Broker_name,colors=picked_colours) %>%
layout(barmode="stack",title="Rate quotes by brokers",xaxis=list(title="",tickangle=0),legend=list(y=0.5))
p_bybroker
#log_scale; but hover shows count not log10(count)
p_bybroker_logscale <- bybroker_data_req %>% plot_ly(x=~`as.factor(as.Date(timestamp))`,y=~log10(count),type="bar",color=~Broker_name,colors=picked_colours,hoverinfo="text",text=~paste0(Broker_name,":\t",count)) %>%
layout(barmode="stack",title="Requests by brokers",xaxis=list(title="",tickangle=0),legend=list(y=0.5))
#p_bybroker_logscale
TravelJigsaw has the most rate quotes, then BSP Auto and after that SupplierWebsite.
bybroker_data_res <- reservations %>% group_by(date=as.Date(rate_request_timestamp),`Broker name`) %>% summarise(count=n())
picked_colours <- brewer.pal(length(unique(bybroker_data_res$`Broker name`)),"Spectral")
p_bybroker_res <- bybroker_data_res %>% plot_ly(x=~date,y=~count,type="bar",color=~`Broker name`,colors=picked_colours) %>%
layout(barmode="stack",title="Reservations by brokers",xaxis=list(title="Reservation request date",tickangle=0,type="date"),legend=list(y=0.5))
p_bybroker_res
One can notice that TravelJigsaw still has the most reservations, but the second place goes to SupplierWebsite, not BSP Auto as it was in rate quote data. Seems like SupplierWebsite first appeared in July 2017 and covers a significant percentage of reservations.
#HOTDAYS-----------------
#Rate quotes
#by day
hotdays_rq_nodupl_brokers_byday <- hotdays_rq_nodupl_brokers_byday[,-6]#removing ratechain test
hotdays_rq_nodupl_brokers_byday$TravelJigsaw <- as.numeric(hotdays_rq_nodupl_brokers_byday$TravelJigsaw)
hotdays_rq_nodupl_brokers_byday <- melt(hotdays_rq_nodupl_brokers_byday)
hotdays_rq_nodupl_brokers_byday[is.na(hotdays_rq_nodupl_brokers_byday)] <- 0
picked_colours <- brewer.pal(length(unique(hotdays_rq_nodupl_brokers_byday$variable)),"Spectral")
p_hotdays_rq_brokers_byday <- hotdays_rq_nodupl_brokers_byday %>% plot_ly(x=~timestamp,y=~value,color=~variable,colors=picked_colours) %>%
add_lines() %>% layout(title="Hot days (rate quotes) by brokers",xaxis=list(title="",type="date"),legend=list(y=0.5),yaxis=list(title="count of requests"))
p_hotdays_rq_brokers_byday
hotdays_rq_nodupl_brokers_byweek <- hotdays_rq_nodupl_brokers_byweek[,-6]#removing ratechain test
hotdays_rq_nodupl_brokers_byweek$TravelJigsaw <- as.numeric(hotdays_rq_nodupl_brokers_byweek$TravelJigsaw)
hotdays_rq_nodupl_brokers_byweek <- melt(hotdays_rq_nodupl_brokers_byweek)
hotdays_rq_nodupl_brokers_byweek[is.na(hotdays_rq_nodupl_brokers_byweek)] <- 0
picked_colours <- brewer.pal(length(unique(hotdays_rq_nodupl_brokers_byweek$variable)),"Spectral")
p_hotdays_rq_brokers_byweek <- hotdays_rq_nodupl_brokers_byweek %>% plot_ly(x=~timestamp,y=~value,color=~variable,colors=picked_colours) %>%
add_lines() %>% layout(title="Hot weeks (rate quotes) by brokers",xaxis=list(title="",type="date"),legend=list(y=0.5),yaxis=list(title="count of requests"))
p_hotdays_rq_brokers_byweek
hotdays_reserv_brokers_byday <- hotdays_reserv_brokers_byday[,-6]#removing ratechain test
hotdays_reserv_brokers_byday <- melt(hotdays_reserv_brokers_byday)
hotdays_reserv_brokers_byday[is.na(hotdays_reserv_brokers_byday)] <- 0
hotdays_reserv_brokers_byday <-hotdays_reserv_brokers_byday[as.Date(hotdays_reserv_brokers_byday$timestamp) >="2017-02-27",]
picked_colours <- brewer.pal(length(unique(hotdays_reserv_brokers_byday$variable)),"Spectral")
p_hotdays_reserv_brokers_byday <- hotdays_reserv_brokers_byday %>% plot_ly(x=~timestamp,y=~value,color=~variable,colors=picked_colours) %>%
add_lines() %>% layout(title="Hot days (reservations) by brokers",xaxis=list(title="",type="date"),legend=list(y=0.5),yaxis=list(title="count of reservations"))
p_hotdays_reserv_brokers_byday
hotdays_reserv_acriss_byday <- melt(hotdays_reserv_acriss_byday)
hotdays_reserv_acriss_byday[is.na(hotdays_reserv_acriss_byday)] <- 0
picked_colours <- brewer.pal(length(unique(hotdays_reserv_acriss_byday$variable)),"Spectral")
hotdays_reserv_acriss_byday <-hotdays_reserv_acriss_byday[as.Date(hotdays_reserv_acriss_byday$timestamp) >="2017-02-27",]
p_hotdays_reserv_acriss_byday <- hotdays_reserv_acriss_byday %>% plot_ly(x=~timestamp,y=~value,color=~variable,colors=picked_colours,type="bar") %>%
layout(barmode="stack",title="Hot days (reservations) by car type",xaxis=list(title="",type="date"),legend=list(y=0.5,x=0.95),yaxis=list(title="count of reservations"))
p_hotdays_reserv_acriss_byday
#In order to save a html widget
#htmlwidgets::saveWidget(p_hotdays_reserv_acriss_byday,"hotdaysbycartype.html")
hotdays_reserv_statustype_byday <- melt(hotdays_reserv_statustype_byday)
hotdays_reserv_statustype_byday[is.na(hotdays_reserv_statustype_byday)] <- 0
hotdays_reserv_statustype_byday <-hotdays_reserv_statustype_byday[as.Date(hotdays_reserv_statustype_byday$timestamp) >="2017-02-27",]
picked_colours <- brewer.pal(length(unique(hotdays_reserv_statustype_byday$variable)),"Spectral")
p_hotdays_reserv_statustype_byday <- hotdays_reserv_statustype_byday %>% plot_ly(x=~timestamp,y=~value,color=~variable,colors=picked_colours,type="bar") %>%
layout(barmode="stack",title="Hot days (reservations) by status type",xaxis=list(title="",type="date"),yaxis=list(title="count of reservations"))
p_hotdays_reserv_statustype_byday
#Season and status type
p_by_season_status_type_facets_reserv <-ggplot(modif_res,aes(x=days_in_advance,group=reservation_status_type,fill=reservation_status_type))+geom_histogram(binwidth = 10,position = "stack",colour="black")+
facet_wrap(~Season,nrow=2)+scale_fill_manual(values=brewer.pal(5,"Spectral"))+theme_bw()+labs(x="Days in advance",title="Reservations made by season",fill="")+scale_x_continuous(breaks=c(seq(0,1000,50)))+
theme(panel.grid.minor=element_blank(),legend.position = "bottom",plot.title=element_text(hjust=0.5),panel.grid.major.x=element_blank())
p_by_season_status_type_facets_reserv
First let’s look at the plot where the count of reservations and the count of rate quotes for the same day are shown. Note that in the particular dataset there are no days when the count of reservations is 0, however there are a couple of days when the count of rate quotes is smaller (and even 0) than the count of reservations. It means that the rate quotes were made before 1st of December in 2017.
Firstly, let’s look at rate quotes data, that has been gone through phase 1 (duplicates by uuid have been removed).
reservations$rate_request_timestamp <- as.Date(reservations$rate_request_timestamp)
reservations$pickup_timestamp <- as.Date(reservations$pickup_timestamp)
res1 <- reservations %>% select(rate_request_timestamp,pickup_timestamp,rental_days,`Broker name`)
res <- data.frame(rate_request_timestamp=as.Date(character()),
rented_day_timestamp=as.Date(character()),
Broker_name=character(),
uid = integer(),
stringsAsFactors=FALSE)
lengtht <- nrow(res1)#approx 8 minutes
for(i in 1:lengtht){
lengtht2 <- res1$rental_days[i]
for(j in 1:lengtht2){
temp <- res1[i,]
res[nrow(res)+1,] <- c(as.character(as.Date(temp$rate_request_timestamp)),as.character(as.Date(temp$pickup_timestamp) + j - 1),temp$`Broker name`,i)
}
}
hotdays_rq_nodupl_brokers_byday[is.na(hotdays_rq_nodupl_brokers_byday)] <- 0
rq <- melt(hotdays_rq_nodupl_brokers_byday)
rq$timestamp <- as.Date(rq$timestamp)
colnames(rq)[2] <- "Broker_name"
res <- res %>% filter(as.Date(rate_request_timestamp) >= "2017-12-01") %>%group_by(rented_day_timestamp,Broker_name) %>% summarise(count=n())
colnames(res)[1] <- "timestamp"
rq <- rq[rq$timestamp %in% res$timestamp,]
all <- merge(rq,res,by=c("timestamp","Broker_name"),all.y = TRUE)
all <- all %>% filter(Broker_name != "RateChain test")
print(paste("The data contains rate quotes and reservations to the following time frame:",min(as.Date(all$timestamp)),"to",max(as.Date(all$timestamp)),"."))
## [1] "The data contains rate quotes and reservations to the following time frame: 2017-12-01 to 2018-11-02 ."
test <- all %>% select(value,count,timestamp) %>% group_by(timestamp) %>%summarise(count=sum(count),value=sum(value))
plot1 <- ggplot(test,aes(x=count,y=value))+geom_point()+labs(x="count of reservations",y="count of rate quotes",title="Rate quotes and reservations relation on phase 1 filtrated data")+theme_bw()+theme(plot.title=element_text(hjust=0.5))
plot1
One can see a relation between the two variables. Now let’s look at the relation when we are using rate quote data that has gone through phase 2 filtration.
rq <- melt(hotdays_rq_filtered)
hotdays_rq_filtered[is.na(hotdays_rq_filtered)] <- 0
rq$timestamp <- as.Date(rq$timestamp)
colnames(rq)[2] <- "Broker_name"
rq <- rq[rq$timestamp %in% res$timestamp,]
all <- merge(rq,res,by=c("timestamp","Broker_name"),all.y = TRUE)
all <- all %>% filter(Broker_name != "RateChain test")
test <- all %>% select(value,count,timestamp) %>% group_by(timestamp) %>%summarise(count=sum(count),value=sum(value))
plot2 <-ggplot(test,aes(x=count,y=value))+geom_point()+labs(x="count of reservations",y="count of rate quotes",title="Rate quotes and reservations relation on phase 2 filtrated data")+theme_bw()+theme(plot.title=element_text(hjust=0.5))
plot2
We can see that the y axis values have decreased a lot, but the overall look of the relation is the same. So let’s continue using the filtered data.
plot3 <-all %>% plot_ly(x=~count,y=~value,color=~Broker_name,type="scatter") %>% layout(title="",xaxis=list(title="count of reservations"),yaxis=list(title="count of rate quotes"))
plot3
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
Now I can see that TravelJigsaw has the most rate quotes. Let’s calculate the rate quotes to reservation ratio.
all$ratio <- all$value/all$count
all_wo_travel_bsp <- all %>% filter(Broker_name != "TravelJigsaw", Broker_name != "BSP Auto")
plot4 <-ggplot(all_wo_travel_bsp,aes(x=Broker_name,y=ratio)) + geom_boxplot()+labs(x="",title="Rate quote to reservations ratio")+ theme_bw() + theme(axis.text.x = element_text(angle = 90),plot.title=element_text(hjust=0.5))
plot4
plot5 <- ggplot(all %>% filter(Broker_name == "TravelJigsaw"),aes(x=Broker_name,y=ratio)) + geom_boxplot()+labs(x="",title="Rate quote to reservations ratio")+ theme_bw()+theme(plot.title=element_text(hjust=0.5))
plot5
plot6 <- ggplot(all %>% filter(Broker_name == "BSP Auto"),aes(x=Broker_name,y=ratio)) + geom_boxplot()+labs(x="",title="Rate quote to reservations ratio")+ theme_bw()+theme(plot.title=element_text(hjust=0.5))
plot6
TravelJigsaw has also the most outliers according to the boxplot.